{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import MinMaxScaler\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>-1.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>-0.5</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.0</td>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     0   1\n",
       "0 -1.0   2\n",
       "1 -0.5   6\n",
       "2  0.0  10\n",
       "3  1.0  18"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]\n",
    "#不太熟悉numpy的小伙伴，能够判断data的结构吗？\n",
    "#如果换成表是什么样子？\n",
    "\n",
    "pd.DataFrame(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.  , 0.  ],\n",
       "       [0.25, 0.25],\n",
       "       [0.5 , 0.5 ],\n",
       "       [1.  , 1.  ]])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#实现归一化\n",
    "scaler = MinMaxScaler() #实例化\n",
    "scaler = scaler.fit(data) #fit，在这里本质是生成min(x)和max(x)\n",
    "result = scaler.transform(data) #通过接口导出结果\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "result_ = scaler.fit_transform(data) #训练和导出结果一步达成\n",
    "scaler.inverse_transform(result) #将归一化后的结果逆转\n",
    "#使用MinMaxScaler的参数feature_range实现将数据归一化到[0,1]以外的范围中\n",
    "data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]\n",
    "scaler = MinMaxScaler(feature_range=[5,10]) #依然实例化，指定结果范围"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 5.  ,  5.  ],\n",
       "       [ 6.25,  6.25],\n",
       "       [ 7.5 ,  7.5 ],\n",
       "       [10.  , 10.  ]])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result = scaler.fit_transform(data)\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "StandardScaler(copy=True, with_mean=True, with_std=True)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 下边进行标准化\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]\n",
    "scaler = StandardScaler()\n",
    "scaler.fit(data)  # 通过fit计算均值，方差时，会将nan丢弃后计算"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-0.125,  9.   ])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scaler.mean_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 0.546875, 35.      ])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scaler.var_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[-1.18321596, -1.18321596],\n",
       "       [-0.50709255, -0.50709255],\n",
       "       [ 0.16903085,  0.16903085],\n",
       "       [ 1.52127766,  1.52127766]])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_std = scaler.transform(data)  # 保持nan的状态显示，如果有nan的话\n",
    "x_std"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_std.mean()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_std.std()  # 数组求标准差"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "x_std.var()  # 数组求方差"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>Survived</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>22.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>38.0</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>26.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>35.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>NaN</td>\n",
       "      <td>male</td>\n",
       "      <td>Q</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>54.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>27.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>14.0</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>4.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>58.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>20.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>39.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>14.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>55.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>2.0</td>\n",
       "      <td>male</td>\n",
       "      <td>Q</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>NaN</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>31.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>NaN</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>35.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>34.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>15.0</td>\n",
       "      <td>female</td>\n",
       "      <td>Q</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>28.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>8.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>38.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>NaN</td>\n",
       "      <td>male</td>\n",
       "      <td>C</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>19.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>NaN</td>\n",
       "      <td>female</td>\n",
       "      <td>Q</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>NaN</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>861</th>\n",
       "      <td>21.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>862</th>\n",
       "      <td>48.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>863</th>\n",
       "      <td>NaN</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>864</th>\n",
       "      <td>24.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>865</th>\n",
       "      <td>42.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>866</th>\n",
       "      <td>27.0</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>867</th>\n",
       "      <td>31.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>868</th>\n",
       "      <td>NaN</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>869</th>\n",
       "      <td>4.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>870</th>\n",
       "      <td>26.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>871</th>\n",
       "      <td>47.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>872</th>\n",
       "      <td>33.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>873</th>\n",
       "      <td>47.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>874</th>\n",
       "      <td>28.0</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>875</th>\n",
       "      <td>15.0</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>876</th>\n",
       "      <td>20.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>877</th>\n",
       "      <td>19.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>878</th>\n",
       "      <td>NaN</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>879</th>\n",
       "      <td>56.0</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>880</th>\n",
       "      <td>25.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>881</th>\n",
       "      <td>33.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>882</th>\n",
       "      <td>22.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>883</th>\n",
       "      <td>28.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>884</th>\n",
       "      <td>25.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>885</th>\n",
       "      <td>39.0</td>\n",
       "      <td>female</td>\n",
       "      <td>Q</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>886</th>\n",
       "      <td>27.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>887</th>\n",
       "      <td>19.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>888</th>\n",
       "      <td>NaN</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>889</th>\n",
       "      <td>26.0</td>\n",
       "      <td>male</td>\n",
       "      <td>C</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>32.0</td>\n",
       "      <td>male</td>\n",
       "      <td>Q</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>891 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      Age     Sex Embarked Survived\n",
       "0    22.0    male        S       No\n",
       "1    38.0  female        C      Yes\n",
       "2    26.0  female        S      Yes\n",
       "3    35.0  female        S      Yes\n",
       "4    35.0    male        S       No\n",
       "5     NaN    male        Q       No\n",
       "6    54.0    male        S       No\n",
       "7     2.0    male        S       No\n",
       "8    27.0  female        S      Yes\n",
       "9    14.0  female        C      Yes\n",
       "10    4.0  female        S  Unknown\n",
       "11   58.0  female        S      Yes\n",
       "12   20.0    male        S       No\n",
       "13   39.0    male        S       No\n",
       "14   14.0  female        S       No\n",
       "15   55.0  female        S  Unknown\n",
       "16    2.0    male        Q       No\n",
       "17    NaN    male        S      Yes\n",
       "18   31.0  female        S       No\n",
       "19    NaN  female        C      Yes\n",
       "20   35.0    male        S  Unknown\n",
       "21   34.0    male        S      Yes\n",
       "22   15.0  female        Q      Yes\n",
       "23   28.0    male        S      Yes\n",
       "24    8.0  female        S       No\n",
       "25   38.0  female        S  Unknown\n",
       "26    NaN    male        C       No\n",
       "27   19.0    male        S       No\n",
       "28    NaN  female        Q      Yes\n",
       "29    NaN    male        S       No\n",
       "..    ...     ...      ...      ...\n",
       "861  21.0    male        S       No\n",
       "862  48.0  female        S      Yes\n",
       "863   NaN  female        S       No\n",
       "864  24.0    male        S  Unknown\n",
       "865  42.0  female        S  Unknown\n",
       "866  27.0  female        C  Unknown\n",
       "867  31.0    male        S       No\n",
       "868   NaN    male        S       No\n",
       "869   4.0    male        S      Yes\n",
       "870  26.0    male        S       No\n",
       "871  47.0  female        S  Unknown\n",
       "872  33.0    male        S       No\n",
       "873  47.0    male        S       No\n",
       "874  28.0  female        C      Yes\n",
       "875  15.0  female        C      Yes\n",
       "876  20.0    male        S       No\n",
       "877  19.0    male        S       No\n",
       "878   NaN    male        S       No\n",
       "879  56.0  female        C      Yes\n",
       "880  25.0  female        S      Yes\n",
       "881  33.0    male        S       No\n",
       "882  22.0  female        S       No\n",
       "883  28.0    male        S  Unknown\n",
       "884  25.0    male        S       No\n",
       "885  39.0  female        Q       No\n",
       "886  27.0    male        S       No\n",
       "887  19.0  female        S      Yes\n",
       "888   NaN  female        S       No\n",
       "889  26.0    male        C  Unknown\n",
       "890  32.0    male        Q       No\n",
       "\n",
       "[891 rows x 4 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 缺失值的处理\n",
    "data = pd.read_csv('./Narrativedata.csv',index_col=0)  # index_col=0表示不要将index作为新的一列出现\n",
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.impute import SimpleImputer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "pandas.core.series.Series"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# age列的缺失值，使用pandas填充即可，无需使用sklearn的包，应该是需要高版本的sklearn才有此包\n",
    "type(data.loc[:,'Age'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.loc[:,'Age'].fillna(data.loc[:,'Age'].mean(), inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>Survived</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>22.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>38.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>26.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>35.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>male</td>\n",
       "      <td>Q</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>54.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>27.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>14.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>4.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>58.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>20.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>39.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>14.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>55.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>2.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>Q</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>31.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>35.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>34.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>15.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>Q</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>28.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>8.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>38.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>male</td>\n",
       "      <td>C</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>19.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>female</td>\n",
       "      <td>Q</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>861</th>\n",
       "      <td>21.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>862</th>\n",
       "      <td>48.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>863</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>864</th>\n",
       "      <td>24.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>865</th>\n",
       "      <td>42.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>866</th>\n",
       "      <td>27.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>867</th>\n",
       "      <td>31.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>868</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>869</th>\n",
       "      <td>4.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>870</th>\n",
       "      <td>26.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>871</th>\n",
       "      <td>47.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>872</th>\n",
       "      <td>33.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>873</th>\n",
       "      <td>47.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>874</th>\n",
       "      <td>28.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>875</th>\n",
       "      <td>15.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>876</th>\n",
       "      <td>20.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>877</th>\n",
       "      <td>19.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>878</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>879</th>\n",
       "      <td>56.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>880</th>\n",
       "      <td>25.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>881</th>\n",
       "      <td>33.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>882</th>\n",
       "      <td>22.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>883</th>\n",
       "      <td>28.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>884</th>\n",
       "      <td>25.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>885</th>\n",
       "      <td>39.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>Q</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>886</th>\n",
       "      <td>27.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>887</th>\n",
       "      <td>19.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>Yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>888</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>889</th>\n",
       "      <td>26.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>C</td>\n",
       "      <td>Unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>32.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>Q</td>\n",
       "      <td>No</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>891 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           Age     Sex Embarked Survived\n",
       "0    22.000000    male        S       No\n",
       "1    38.000000  female        C      Yes\n",
       "2    26.000000  female        S      Yes\n",
       "3    35.000000  female        S      Yes\n",
       "4    35.000000    male        S       No\n",
       "5    29.699118    male        Q       No\n",
       "6    54.000000    male        S       No\n",
       "7     2.000000    male        S       No\n",
       "8    27.000000  female        S      Yes\n",
       "9    14.000000  female        C      Yes\n",
       "10    4.000000  female        S  Unknown\n",
       "11   58.000000  female        S      Yes\n",
       "12   20.000000    male        S       No\n",
       "13   39.000000    male        S       No\n",
       "14   14.000000  female        S       No\n",
       "15   55.000000  female        S  Unknown\n",
       "16    2.000000    male        Q       No\n",
       "17   29.699118    male        S      Yes\n",
       "18   31.000000  female        S       No\n",
       "19   29.699118  female        C      Yes\n",
       "20   35.000000    male        S  Unknown\n",
       "21   34.000000    male        S      Yes\n",
       "22   15.000000  female        Q      Yes\n",
       "23   28.000000    male        S      Yes\n",
       "24    8.000000  female        S       No\n",
       "25   38.000000  female        S  Unknown\n",
       "26   29.699118    male        C       No\n",
       "27   19.000000    male        S       No\n",
       "28   29.699118  female        Q      Yes\n",
       "29   29.699118    male        S       No\n",
       "..         ...     ...      ...      ...\n",
       "861  21.000000    male        S       No\n",
       "862  48.000000  female        S      Yes\n",
       "863  29.699118  female        S       No\n",
       "864  24.000000    male        S  Unknown\n",
       "865  42.000000  female        S  Unknown\n",
       "866  27.000000  female        C  Unknown\n",
       "867  31.000000    male        S       No\n",
       "868  29.699118    male        S       No\n",
       "869   4.000000    male        S      Yes\n",
       "870  26.000000    male        S       No\n",
       "871  47.000000  female        S  Unknown\n",
       "872  33.000000    male        S       No\n",
       "873  47.000000    male        S       No\n",
       "874  28.000000  female        C      Yes\n",
       "875  15.000000  female        C      Yes\n",
       "876  20.000000    male        S       No\n",
       "877  19.000000    male        S       No\n",
       "878  29.699118    male        S       No\n",
       "879  56.000000  female        C      Yes\n",
       "880  25.000000  female        S      Yes\n",
       "881  33.000000    male        S       No\n",
       "882  22.000000  female        S       No\n",
       "883  28.000000    male        S  Unknown\n",
       "884  25.000000    male        S       No\n",
       "885  39.000000  female        Q       No\n",
       "886  27.000000    male        S       No\n",
       "887  19.000000  female        S      Yes\n",
       "888  29.699118  female        S       No\n",
       "889  26.000000    male        C  Unknown\n",
       "890  32.000000    male        Q       No\n",
       "\n",
       "[891 rows x 4 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 目标值转换为数值型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 编码与哑变量，将文字型数据转换为数值型。\n",
    "from sklearn.preprocessing import LabelEncoder  # 将目标值的文本分类转为数值，感觉没多大用途"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0           No\n",
       "1          Yes\n",
       "2          Yes\n",
       "3          Yes\n",
       "4           No\n",
       "5           No\n",
       "6           No\n",
       "7           No\n",
       "8          Yes\n",
       "9          Yes\n",
       "10     Unknown\n",
       "11         Yes\n",
       "12          No\n",
       "13          No\n",
       "14          No\n",
       "15     Unknown\n",
       "16          No\n",
       "17         Yes\n",
       "18          No\n",
       "19         Yes\n",
       "20     Unknown\n",
       "21         Yes\n",
       "22         Yes\n",
       "23         Yes\n",
       "24          No\n",
       "25     Unknown\n",
       "26          No\n",
       "27          No\n",
       "28         Yes\n",
       "29          No\n",
       "        ...   \n",
       "861         No\n",
       "862        Yes\n",
       "863         No\n",
       "864    Unknown\n",
       "865    Unknown\n",
       "866    Unknown\n",
       "867         No\n",
       "868         No\n",
       "869        Yes\n",
       "870         No\n",
       "871    Unknown\n",
       "872         No\n",
       "873         No\n",
       "874        Yes\n",
       "875        Yes\n",
       "876         No\n",
       "877         No\n",
       "878         No\n",
       "879        Yes\n",
       "880        Yes\n",
       "881         No\n",
       "882         No\n",
       "883    Unknown\n",
       "884         No\n",
       "885         No\n",
       "886         No\n",
       "887        Yes\n",
       "888         No\n",
       "889    Unknown\n",
       "890         No\n",
       "Name: Survived, Length: 891, dtype: object"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "le = LabelEncoder()\n",
    "y = data.loc[:, 'Survived']\n",
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 2, 2, 2, 0, 0, 0, 0, 2, 2, 1, 2, 0, 0, 0, 1, 0, 2, 0, 2, 1, 2,\n",
       "       2, 2, 0, 1, 0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 1,\n",
       "       2, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2, 0, 0, 2, 0, 0, 0, 2,\n",
       "       2, 0, 2, 0, 0, 0, 0, 0, 2, 1, 0, 1, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0,\n",
       "       2, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 2, 0, 1, 0,\n",
       "       0, 2, 0, 0, 2, 0, 0, 0, 1, 1, 2, 0, 0, 0, 2, 0, 0, 1, 0, 1, 1, 0,\n",
       "       0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 1, 0, 2, 0, 0, 0,\n",
       "       1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 2, 2, 0, 0, 2, 0, 2, 1, 2, 2, 0, 0,\n",
       "       1, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 2, 1, 2, 1, 0, 0, 2, 2, 0, 2, 0,\n",
       "       2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 2, 1, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2,\n",
       "       0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 1, 0, 2, 2, 2, 2, 2, 0, 2, 0, 1,\n",
       "       0, 0, 1, 2, 2, 1, 0, 2, 2, 0, 2, 2, 0, 0, 1, 1, 0, 0, 0, 2, 0, 0,\n",
       "       2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 2, 0, 2, 2, 2,\n",
       "       0, 2, 2, 2, 0, 0, 0, 2, 2, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 2,\n",
       "       2, 0, 1, 0, 2, 0, 0, 2, 1, 0, 2, 2, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0,\n",
       "       0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 1, 2, 0, 0, 0,\n",
       "       0, 2, 2, 0, 0, 0, 2, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 2, 0, 2, 2, 0,\n",
       "       0, 0, 0, 2, 2, 1, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 1, 2,\n",
       "       0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 0, 2, 0, 0,\n",
       "       2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 2, 1, 2, 0, 2, 2, 0, 2, 1,\n",
       "       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 2, 2, 0, 0, 1, 0, 0, 2, 0, 0, 0, 2,\n",
       "       2, 1, 2, 0, 0, 1, 0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 2, 1,\n",
       "       2, 2, 1, 2, 2, 0, 2, 2, 1, 0, 2, 0, 2, 0, 2, 0, 0, 2, 0, 0, 2, 0,\n",
       "       0, 0, 2, 0, 1, 2, 0, 2, 0, 2, 0, 2, 2, 0, 1, 2, 0, 0, 2, 2, 1, 2,\n",
       "       2, 0, 0, 2, 2, 0, 2, 0, 2, 2, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 2, 2,\n",
       "       1, 2, 0, 0, 2, 2, 0, 2, 2, 2, 0, 0, 0, 2, 0, 2, 1, 0, 0, 2, 0, 0,\n",
       "       0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 2, 0, 1, 2, 2, 1, 0, 0, 2, 0, 0, 1,\n",
       "       0, 0, 2, 0, 0, 2, 2, 0, 0, 0, 1, 2, 1, 0, 1, 0, 2, 0, 0, 2, 0, 0,\n",
       "       0, 0, 0, 2, 0, 2, 2, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0,\n",
       "       1, 0, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0, 2,\n",
       "       0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, 2, 0, 0, 2, 2, 0, 0,\n",
       "       0, 0, 1, 2, 2, 2, 2, 0, 1, 0, 1, 1, 2, 1, 0, 0, 2, 0, 0, 0, 2, 0,\n",
       "       2, 2, 1, 1, 2, 0, 1, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2,\n",
       "       0, 0, 2, 2, 1, 0, 2, 2, 0, 0, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0, 0, 0,\n",
       "       0, 1, 0, 0, 1, 1, 0, 2, 0, 2, 2, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0,\n",
       "       0, 0, 0, 0, 2, 2, 0, 0, 0, 2, 2, 2, 2, 0, 0, 0, 0, 2, 0, 1, 0, 1,\n",
       "       0, 0, 0, 0, 0, 0, 2, 2, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 2,\n",
       "       0, 0, 2, 2, 0, 0, 2, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 2, 0, 2, 2, 2,\n",
       "       2, 0, 0, 0, 2, 0, 1, 1, 1, 0, 0, 2, 0, 1, 0, 0, 2, 2, 0, 0, 0, 2,\n",
       "       2, 0, 0, 1, 0, 0, 0, 2, 0, 1, 0])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lable = le.fit_transform(y)\n",
    "lable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['No', 'Unknown', 'Yes'], dtype=object)"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "le.classes_  # 查看有哪些目标值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.loc[:, 'Survived'] = lable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>Survived</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>22.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>38.0</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>26.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>35.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Age     Sex Embarked  Survived\n",
       "0  22.0    male        S         0\n",
       "1  38.0  female        C         2\n",
       "2  26.0  female        S         2\n",
       "3  35.0  female        S         2\n",
       "4  35.0    male        S         0"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 891 entries, 0 to 890\n",
      "Data columns (total 4 columns):\n",
      "Age         891 non-null float64\n",
      "Sex         891 non-null object\n",
      "Embarked    889 non-null object\n",
      "Survived    891 non-null int32\n",
      "dtypes: float64(1), int32(1), object(2)\n",
      "memory usage: 31.3+ KB\n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import OrdinalEncoder\n",
    "#接口categories_对应LabelEncoder的接口classes_，一模一样的功能\n",
    "data_ = data.copy()\n",
    "data_.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>Survived</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>22.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>38.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>26.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>35.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>male</td>\n",
       "      <td>Q</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>54.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>2.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>27.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>14.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>4.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>58.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>20.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>39.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>14.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>55.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>2.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>Q</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>31.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>35.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>34.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>15.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>Q</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>28.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>8.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>38.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>male</td>\n",
       "      <td>C</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>19.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>female</td>\n",
       "      <td>Q</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>861</th>\n",
       "      <td>21.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>862</th>\n",
       "      <td>48.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>863</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>864</th>\n",
       "      <td>24.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>865</th>\n",
       "      <td>42.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>866</th>\n",
       "      <td>27.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>867</th>\n",
       "      <td>31.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>868</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>869</th>\n",
       "      <td>4.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>870</th>\n",
       "      <td>26.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>871</th>\n",
       "      <td>47.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>872</th>\n",
       "      <td>33.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>873</th>\n",
       "      <td>47.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>874</th>\n",
       "      <td>28.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>875</th>\n",
       "      <td>15.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>876</th>\n",
       "      <td>20.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>877</th>\n",
       "      <td>19.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>878</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>879</th>\n",
       "      <td>56.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>880</th>\n",
       "      <td>25.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>881</th>\n",
       "      <td>33.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>882</th>\n",
       "      <td>22.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>883</th>\n",
       "      <td>28.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>884</th>\n",
       "      <td>25.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>885</th>\n",
       "      <td>39.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>Q</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>886</th>\n",
       "      <td>27.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>887</th>\n",
       "      <td>19.000000</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>888</th>\n",
       "      <td>29.699118</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>889</th>\n",
       "      <td>26.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>C</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>32.000000</td>\n",
       "      <td>male</td>\n",
       "      <td>Q</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>891 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           Age     Sex Embarked  Survived\n",
       "0    22.000000    male        S         0\n",
       "1    38.000000  female        C         2\n",
       "2    26.000000  female        S         2\n",
       "3    35.000000  female        S         2\n",
       "4    35.000000    male        S         0\n",
       "5    29.699118    male        Q         0\n",
       "6    54.000000    male        S         0\n",
       "7     2.000000    male        S         0\n",
       "8    27.000000  female        S         2\n",
       "9    14.000000  female        C         2\n",
       "10    4.000000  female        S         1\n",
       "11   58.000000  female        S         2\n",
       "12   20.000000    male        S         0\n",
       "13   39.000000    male        S         0\n",
       "14   14.000000  female        S         0\n",
       "15   55.000000  female        S         1\n",
       "16    2.000000    male        Q         0\n",
       "17   29.699118    male        S         2\n",
       "18   31.000000  female        S         0\n",
       "19   29.699118  female        C         2\n",
       "20   35.000000    male        S         1\n",
       "21   34.000000    male        S         2\n",
       "22   15.000000  female        Q         2\n",
       "23   28.000000    male        S         2\n",
       "24    8.000000  female        S         0\n",
       "25   38.000000  female        S         1\n",
       "26   29.699118    male        C         0\n",
       "27   19.000000    male        S         0\n",
       "28   29.699118  female        Q         2\n",
       "29   29.699118    male        S         0\n",
       "..         ...     ...      ...       ...\n",
       "861  21.000000    male        S         0\n",
       "862  48.000000  female        S         2\n",
       "863  29.699118  female        S         0\n",
       "864  24.000000    male        S         1\n",
       "865  42.000000  female        S         1\n",
       "866  27.000000  female        C         1\n",
       "867  31.000000    male        S         0\n",
       "868  29.699118    male        S         0\n",
       "869   4.000000    male        S         2\n",
       "870  26.000000    male        S         0\n",
       "871  47.000000  female        S         1\n",
       "872  33.000000    male        S         0\n",
       "873  47.000000    male        S         0\n",
       "874  28.000000  female        C         2\n",
       "875  15.000000  female        C         2\n",
       "876  20.000000    male        S         0\n",
       "877  19.000000    male        S         0\n",
       "878  29.699118    male        S         0\n",
       "879  56.000000  female        C         2\n",
       "880  25.000000  female        S         2\n",
       "881  33.000000    male        S         0\n",
       "882  22.000000  female        S         0\n",
       "883  28.000000    male        S         1\n",
       "884  25.000000    male        S         0\n",
       "885  39.000000  female        Q         0\n",
       "886  27.000000    male        S         0\n",
       "887  19.000000  female        S         2\n",
       "888  29.699118  female        S         0\n",
       "889  26.000000    male        C         1\n",
       "890  32.000000    male        Q         0\n",
       "\n",
       "[891 rows x 4 columns]"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import numpy as np\n",
    "# si = SimpleImputer(missing_values=np.nan, strategy=\"most_frequent\")\n",
    "# data_=si.fit_transform(data_)\n",
    "data_.loc[:,'Embarked'].fillna(method=\"ffill\",inplace=True)\n",
    "data_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[array([0., 1.]), array([0., 1., 2.])]"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "OrdinalEncoder().fit(data_.iloc[:,1:-1]).categories_\n",
    "#LabelEncoder().fit(data_.iloc[:,1]).classes_\n",
    "# 两者区别似乎仅仅是label只能处理一维标签，针对目标值，ordinal则可以多维同时处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>Survived</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>22.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>38.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>26.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>35.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Age  Sex  Embarked  Survived\n",
       "0  22.0  1.0       2.0         0\n",
       "1  38.0  0.0       0.0         2\n",
       "2  26.0  0.0       2.0         2\n",
       "3  35.0  0.0       2.0         2\n",
       "4  35.0  1.0       2.0         0"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_.iloc[:,1:-1] = OrdinalEncoder().fit_transform(data_.iloc[:,1:-1])  # 这种做法是不合适的\n",
    "data_.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 以上除了label的功能似乎有用，另外一个没用感觉\n",
    "# one-hot编码快速制作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Embarked</th>\n",
       "      <th>Survived</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>22.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>38.0</td>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>26.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>35.0</td>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35.0</td>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Age     Sex Embarked  Survived\n",
       "0  22.0    male        S         0\n",
       "1  38.0  female        C         2\n",
       "2  26.0  female        S         2\n",
       "3  35.0  female        S         2\n",
       "4  35.0    male        S         0"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sex</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>female</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>female</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>male</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Sex Embarked\n",
       "0    male        S\n",
       "1  female        C\n",
       "2  female        S\n",
       "3  female        S\n",
       "4    male        S"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.preprocessing import OneHotEncoder\n",
    "data.loc[:,'Embarked'].fillna(method=\"ffill\",inplace=True)\n",
    "X = data.iloc[:,1:-1]\n",
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "ohe = OneHotEncoder(categories='auto')  # 可以传递参数sparse=False 生成非稀疏矩阵\n",
    "X = ohe.fit_transform(X)  # 得到的是稀疏矩阵，需要使用toarray转换\n",
    "X = X.toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['x0_female', 'x0_male', 'x1_C', 'x1_Q', 'x1_S'], dtype=object)"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ohe.get_feature_names()  # 获取转换后的特征名称"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>female</th>\n",
       "      <th>male</th>\n",
       "      <th>c</th>\n",
       "      <th>q</th>\n",
       "      <th>s</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   female  male    c    q    s\n",
       "0     0.0   1.0  0.0  0.0  1.0\n",
       "1     1.0   0.0  1.0  0.0  0.0\n",
       "2     1.0   0.0  0.0  0.0  1.0\n",
       "3     1.0   0.0  0.0  0.0  1.0\n",
       "4     0.0   1.0  0.0  0.0  1.0"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 合并数据\n",
    "X = pd.DataFrame(X, columns=['female','male','c','q','s'])\n",
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Survived</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>22.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>38.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>26.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>35.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Age  Survived\n",
       "0  22.0         0\n",
       "1  38.0         2\n",
       "2  26.0         2\n",
       "3  35.0         2\n",
       "4  35.0         0"
      ]
     },
     "execution_count": 118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.drop(['Sex', 'Embarked'], axis=1, inplace=True)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Survived</th>\n",
       "      <th>female</th>\n",
       "      <th>male</th>\n",
       "      <th>c</th>\n",
       "      <th>q</th>\n",
       "      <th>s</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>22.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>38.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>26.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>35.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Age  Survived  female  male    c    q    s\n",
       "0  22.0         0     0.0   1.0  0.0  0.0  1.0\n",
       "1  38.0         2     1.0   0.0  1.0  0.0  0.0\n",
       "2  26.0         2     1.0   0.0  0.0  0.0  1.0\n",
       "3  35.0         2     1.0   0.0  0.0  0.0  1.0\n",
       "4  35.0         0     0.0   1.0  0.0  0.0  1.0"
      ]
     },
     "execution_count": 119,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.concat([data, X],axis=1)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "data.columns = [\"Age\",\"Survived\",\"Female\",\"Male\",\"Embarked_C\",\"Embarked_Q\",\"Embarked_S\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Female</th>\n",
       "      <th>Male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>22.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>38.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>26.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>35.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Age  Survived  Female  Male  Embarked_C  Embarked_Q  Embarked_S\n",
       "0  22.0         0     0.0   1.0         0.0         0.0         1.0\n",
       "1  38.0         2     1.0   0.0         1.0         0.0         0.0\n",
       "2  26.0         2     1.0   0.0         0.0         0.0         1.0\n",
       "3  35.0         2     1.0   0.0         0.0         0.0         1.0\n",
       "4  35.0         0     0.0   1.0         0.0         0.0         1.0"
      ]
     },
     "execution_count": 124,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Female</th>\n",
       "      <th>Male</th>\n",
       "      <th>Embarked_C</th>\n",
       "      <th>Embarked_Q</th>\n",
       "      <th>Embarked_S</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>22.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>38.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>26.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>35.0</td>\n",
       "      <td>2</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Age  Survived  Female  Male  Embarked_C  Embarked_Q  Embarked_S\n",
       "0  22.0         0     0.0   1.0         0.0         0.0         1.0\n",
       "1  38.0         2     1.0   0.0         1.0         0.0         0.0\n",
       "2  26.0         2     1.0   0.0         0.0         0.0         1.0\n",
       "3  35.0         2     1.0   0.0         0.0         0.0         1.0\n",
       "4  35.0         0     0.0   1.0         0.0         0.0         1.0"
      ]
     },
     "execution_count": 125,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_2 = data.copy()\n",
    "data_2.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 连续特征二值化操作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Age</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>22.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>38.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>26.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>35.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>35.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Age\n",
       "0  22.0\n",
       "1  38.0\n",
       "2  26.0\n",
       "3  35.0\n",
       "4  35.0"
      ]
     },
     "execution_count": 144,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 这个有用，如豆瓣电影根据评分分为推荐和不推荐两类，当时使用mapping方法实现的\n",
    "from sklearn.preprocessing import Binarizer\n",
    "# 取出年龄进行二值化\n",
    "age = data_2.loc[:, ['Age']]\n",
    "age.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.]])"
      ]
     },
     "execution_count": 145,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "b = Binarizer(threshold=30)\n",
    "b.fit(age)\n",
    "age = b.transform(age)\n",
    "age[0:5,:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 连续特征多值化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [2.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [2.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [2.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.],\n",
       "       [0.],\n",
       "       [1.]])"
      ]
     },
     "execution_count": 150,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 这里进行的分箱操作，无法指定区间为某一个类别\n",
    "from sklearn.preprocessing import KBinsDiscretizer\n",
    "X = data.iloc[:,0].values.reshape(-1,1)\n",
    "# ordinal的方式不会进行onehot编码，是在原列上替换为0，1，2数字，就像上边的ordinalencoder一样\n",
    "est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')\n",
    "est.fit_transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{0.0, 1.0, 2.0}"
      ]
     },
     "execution_count": 151,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#查看转换后分的箱：变成了一列中的三箱\n",
    "set(est.fit_transform(X).ravel())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1., 0., 0.],\n",
       "       [0., 1., 0.],\n",
       "       [1., 0., 0.],\n",
       "       ...,\n",
       "       [0., 1., 0.],\n",
       "       [1., 0., 0.],\n",
       "       [0., 1., 0.]])"
      ]
     },
     "execution_count": 148,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "est = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='uniform')\n",
    "#查看转换后分的箱：变成了哑变量\n",
    "est.fit_transform(X).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
